##########################################################################################################################################
###### This file estimates the HHI for the different industries
###############################
# The dataframes saved here are later used in the main analyses
#######################################
# There are other code files that are used for data preparation
# This processed data are saved and loaded in this code


#Compustat Concentration Ratios (Herfindahl Index:)

#following Grullon et al. (2019) - Are US Industries Becoming More Concentrated?

#Use the NAICS definition of Company Codes: --> Look at the Online Appendix for further information:


#NAICS: 6 digit Code --> first two digits: designates the industry sector
#                    --> third digits designates the subsector
#                    --> fourth digit designates a industry group

#Grullon et al. (2019) use the three digit specification (reasoning as well given in the Online Appendix)

#The following process is used to fill NAICS Codes:

# (1) Use Compustat historical NAICS (NAICSH) whenever available
# (2) If (1) is not availabe, use CRSP historical value (NAICS from msenames table)
# (3) If neither (1) or (2) is available use NAICS from Compustats names table
# (4) If none of the previous values are available, convert SIC codes to NAICS with the U.S. Census Bureau conversion table

#######
# this is a Compustat file containing the following elements:
# gvkey,datadate,fyear,indfmt,consol,popsrc,datafmt,tic,cusip,conm,curcd,sale,exchg,cik,costat,naicsh,naics,sic
Compustat = read.csv("0a4a9373fde91d3a.csv", header=T)

######
# this is the Link Table from the CRSP-Compustat Merged database
# it contains the elements: gvkey,LPERMNO,LINKDT,LINKENDDT
Link_Table = read.table("LinkTable.txt", header=T, sep=",")
#####

######
# this is a datafile from CRSP containing the following elements:
# PERMNO	date	SICCD	TICKER	COMNAM	NAICS	PERMCO	CUSIP	PRC
CRSP = read.table("ac2f35cda7948e58.txt",header=T,sep="\t")

library(stringi)
#First we can remove all companies without any sales data:

Compustat = Compustat[!is.na(Compustat$sale),]

Compustat$datadate = as.Date(as.character(Compustat$datadate),format="%Y%m%d")
#This is why he starts at 1972, before we barely have any data:
table(format(Compustat$datadate,"%Y"))
#Just match the time of Grullon et al. (2019)
Compustat = Compustat[Compustat$datadate>"1972-01-01",]

#Use NAICSH whenever available, create a three digit representation of the NAICS and a four digit one:

Compustat$NAICS_3 = as.numeric(unlist(lapply(strsplit(as.character(Compustat$naicsh),split=""),function(x) stri_paste(head(x,n=3),collapse=""))))
Compustat$NAICS_4 = as.numeric(unlist(lapply(strsplit(as.character(Compustat$naicsh),split=""),function(x) stri_paste(head(x,n=4),collapse=""))))
#If this is NA, then use CRSP NAICS

#Subset of the three digit Code, which is NA:
NA_Subset = Compustat[is.na(Compustat$NAICS_3),]
#The Link Table has multiple LPERMNOs with one gvkey:
#So also match them within the time-frame defined by LINKDT and LINKENDDT

NA_Subset$PERMNO = rep(NA,nrow(NA_Subset))
for(k in which(!duplicated(NA_Subset$gvkey))){
  if(length(Link_Table$LINKENDDT[Link_Table$gvkey %in% NA_Subset$gvkey[k]])!=0){

  for(j in 1:length(Link_Table$LINKENDDT[Link_Table$gvkey %in% NA_Subset$gvkey[k]])){
    Temp = NA_Subset[NA_Subset$gvkey == NA_Subset$gvkey[k],]
    Temp_Seq = c()
  if(is.na(as.numeric(format(as.Date(as.character(Link_Table$LINKENDDT[Link_Table$gvkey %in% NA_Subset$gvkey[k]][j]),format="%Y%m%d"),"%Y")))){
    Temp_Seq = seq(from = as.numeric(format(as.Date(as.character(Link_Table$LINKDT[Link_Table$gvkey %in% NA_Subset$gvkey[k]][j]),format="%Y%m%d"),"%Y")),
                                    to = 2020)
    Temp_Seq = unique(Temp_Seq)
    Temp_Indik = Temp$fyear %in% Temp_Seq
    Temp = Temp[Temp$fyear %in% Temp_Seq,] 
    #
    
    NA_Subset[NA_Subset$gvkey == NA_Subset$gvkey[k],]$PERMNO[Temp_Indik] = rep(Link_Table$LPERMNO[Link_Table$gvkey %in% NA_Subset$gvkey[k]][j],nrow(Temp))

  }else{
    Temp_Seq = seq(from = as.numeric(format(as.Date(as.character(Link_Table$LINKDT[Link_Table$gvkey %in% NA_Subset$gvkey[k]][j]),format="%Y%m%d"),"%Y")),
                                    to = as.numeric(format(as.Date(as.character(Link_Table$LINKENDDT[Link_Table$gvkey %in% NA_Subset$gvkey[k]][j]),format="%Y%m%d"),"%Y")))

    Temp_Seq = unique(Temp_Seq)
    Temp_Indik = Temp$fyear %in% Temp_Seq
    Temp = Temp[Temp$fyear %in% Temp_Seq,] 
    #
    
    NA_Subset[NA_Subset$gvkey == NA_Subset$gvkey[k],]$PERMNO[Temp_Indik] = rep(Link_Table$LPERMNO[Link_Table$gvkey %in% NA_Subset$gvkey[k]][j],nrow(Temp))
    }
  }
  }
}


#The NAICS of the CRSP data is not NA at some point in the future, depending on the year, so we must first 
#compile the CRSP data by company and add two columns telling us from when until when the NAICS was available
CRSP = CRSP[!is.na(CRSP$NAICS),]
#Now we simply need to figure out, when it starts to be not NA per company, just look at this every December
#since the Compustat data is annually
CRSP$date = as.Date(as.character(CRSP$date),format="%Y%m%d") 
CRSP = CRSP[format(CRSP$date,"%m")=="12",]

#Create a new dataframe with a PERMNO, Start and End Date, need nothing else so far.
#At Max we have 6 unique NAICS Codes per company!
CRSP_NAICS = data.frame(PERMNO = unique(CRSP$PERMNO), Start = rep(NA,length(unique(CRSP$PERMNO))), 
                        End = rep(NA,length(unique(CRSP$PERMNO))))

for(j in 1:nrow(CRSP_NAICS)){
  Temp = CRSP[CRSP$PERMNO==CRSP_NAICS$PERMNO[j],]
  CRSP_NAICS$Start[j] = format(Temp$date,"%Y")[1]
  CRSP_NAICS$End[j] = format(Temp$date,"%Y")[nrow(Temp)]
}


for(i in 1:nrow(CRSP_NAICS)){
  #If this PERMNO is as well included in the subset which is NA, so has no NAICSH
  if(sum(!is.na(NA_Subset[CRSP_NAICS$PERMNO[i]==NA_Subset$PERMNO,]))==0){
    
  }else{
  Temp = NA_Subset[!is.na(NA_Subset$PERMNO) & NA_Subset$PERMNO == CRSP_NAICS$PERMNO[i],]
  Interm = CRSP[CRSP$PERMNO==unique(Temp$PERMNO) & format(CRSP$date,"%Y") %in% Temp$fyear,]
  #If Interm is NULL, we have no CRSP observations that match the NA subset from Compustat in year and PERMNO
  if(nrow(Interm) == 0){}else{
  Temp$NAICS_3 = as.numeric(unlist(lapply(strsplit(as.character(Interm$NAICS),split=""),function(x) stri_paste(head(x,n=3),collapse=""))))
  Temp$NAICS_4 = as.numeric(unlist(lapply(strsplit(as.character(Interm$NAICS),split=""),function(x) stri_paste(head(x,n=4),collapse=""))))
  
  NA_Subset[!is.na(NA_Subset$PERMNO) & NA_Subset$PERMNO == CRSP_NAICS$PERMNO[i],] = Temp
  }
  }
}
#Seems still to be working correctly (...)
which(CRSP_NAICS$PERMNO%in%NA_Subset$PERMNO)

#Now use the Compustat NAISC (naics)
NA_Subset[is.na(NA_Subset$NAICS_3),]$NAICS_3 = as.numeric(unlist(lapply(strsplit(as.character(NA_Subset[is.na(NA_Subset$NAICS_3),]$naics),split=""),function(x) stri_paste(head(x,n=3),collapse=""))))
NA_Subset[is.na(NA_Subset$NAICS_4),]$NAICS_4 = as.numeric(unlist(lapply(strsplit(as.character(NA_Subset[is.na(NA_Subset$NAICS_4),]$naics),split=""),function(x) stri_paste(head(x,n=4),collapse=""))))

#Delete all 4 digit NAICS that are not 4 digits:
NA_Subset$NAICS_3[NA_Subset$NAICS_3<100] = NA
NA_Subset$NAICS_4[NA_Subset$NAICS_4<1000] = NA


NA_Subset[is.na(NA_Subset$NAICS_3),]
##############################
# These files contain the SIC to NAICS conversion
# the files contain the following information:
# SIC,	Part Indicator,	SIC Titles and Part Descriptions,	1997 NAICS,	1997 NAICS Titles and Part Indicators
#SIC 1987 to 1997 NAICS
SIC_NAICS97 <- read.table("1987_SIC_to_1997_NAICS.txt", header=T, sep="\t")
#1997 NAICS to 2002 NAICS
NAICS97_NAICS02 <- read.table("1997_NAICS_to_2002_NAICS.txt", header=T, sep="\t")
#2002 NAICS to 2007 NAICS
NAICS02_NAICS07 <- read.table("2002_to_2007_NAICS.txt", header=T, sep="\t")
#2007 NAICS to 2012 NAICS
NAICS07_NAICS12 <- read.table("2007_to_2012_NAICS.txt", header=T, sep="\t")
#2012 NAICS to 2017 NAICS
NAICS12_NAICS17 <- read.table("2012_to_2017_NAICS.txt", header=T, sep="\t")
###############

SIC_NAICS97 <- na.omit(SIC_NAICS97)
NAICS97_NAICS02 <- na.omit(NAICS97_NAICS02)
NAICS02_NAICS07 <- na.omit(NAICS02_NAICS07)

options(warn=2)

for(i in which(!duplicated(NA_Subset$sic))){
  Temp = NA_Subset[NA_Subset$sic == NA_Subset$sic[i] & is.na(NA_Subset$NAICS_3),]
  NAICS_Temp = c()
  if(length(SIC_NAICS97[SIC_NAICS97$SIC == NA_Subset$sic[i],]$X1997.NAICS)==0){}else{
  #Everything below 02 take this value:
  #can be more than one NAICS per SIC, pick the most NAICS
  if(length(SIC_NAICS97[SIC_NAICS97$SIC == NA_Subset$sic[i],]$X1997.NAICS)>1){
    library(pracma)
    NAICS_Temp = rep(SIC_NAICS97[SIC_NAICS97$SIC == NA_Subset$sic[i],]$X1997.NAICS[Mode(as.numeric(unlist(lapply(strsplit(as.character(SIC_NAICS97[SIC_NAICS97$SIC == NA_Subset$sic[i],]$X1997.NAICS),split=""),function(x) stri_paste(head(x,n=3),collapse=""))))) == 
    as.numeric(unlist(lapply(strsplit(as.character(SIC_NAICS97[SIC_NAICS97$SIC == NA_Subset$sic[i],]$X1997.NAICS),split=""),function(x) stri_paste(head(x,n=3),collapse=""))))][1], nrow(Temp))
  }else if (length(SIC_NAICS97[SIC_NAICS97$SIC == NA_Subset$sic[i],]$X1997.NAICS) == 1){
  NAICS_Temp = rep(SIC_NAICS97[SIC_NAICS97$SIC == NA_Subset$sic[i],]$X1997.NAICS, nrow(Temp))
  }
  #Update all values above 02:
  NAICS_Temp[Temp$fyear>=2002] = rep(NAICS97_NAICS02[NAICS97_NAICS02$NAICS97 %in% NAICS_Temp,]$NAICS02[1], nrow(Temp))[Temp$fyear>=2002]
  #Update all values above 07:
  NAICS_Temp[Temp$fyear>=2007] = rep(NAICS02_NAICS07[NAICS02_NAICS07$X2002.NAICS.Code %in% NAICS_Temp,]$X2007.NAICS.Code[1], nrow(Temp))[Temp$fyear>=2007]
  #Update all values above 12:
  NAICS_Temp[Temp$fyear>=2012] = rep(NAICS07_NAICS12[NAICS07_NAICS12$X2007.NAICS.Code %in% NAICS_Temp,]$X2012.NAICS.Code[1], nrow(Temp))[Temp$fyear>=2012]
  #Update all values above 17:
  NAICS_Temp[Temp$fyear>=2017] = rep(NAICS12_NAICS17[NAICS12_NAICS17$X2012.NAICS.Code %in% NAICS_Temp,]$X2017.NAICS.Code[1], nrow(Temp))[Temp$fyear>=2017]
  
  #Save the temporary NAICS:
  Temp$NAICS_3 = as.numeric(unlist(lapply(strsplit(as.character(NAICS_Temp),split=""),function(x) stri_paste(head(x,n=3),collapse=""))))
  Temp$NAICS_4 = as.numeric(unlist(lapply(strsplit(as.character(NAICS_Temp),split=""),function(x) stri_paste(head(x,n=4),collapse=""))))
  
  NA_Subset[NA_Subset$sic == NA_Subset$sic[i] & is.na(NA_Subset$NAICS_3),] = Temp
  }
}

options(warn=1)

#Remove the PERMNO:
NA_Subset$PERMNO = NULL

Compustat[is.na(Compustat$NAICS_3),] = NA_Subset

#Only the merged CRSP/Compustat Database, as in Grullon et al. (2019)
options(warn=2)
Compustat$PERMNO = rep(NA,nrow(Compustat))
for(k in which(!duplicated(Compustat$gvkey))){
  if(length(Link_Table$LINKENDDT[Link_Table$gvkey %in% Compustat$gvkey[k]])!=0){
    for(j in 1:length(Link_Table$LINKENDDT[Link_Table$gvkey %in% Compustat$gvkey[k]])){
      Temp = Compustat[Compustat$gvkey == Compustat$gvkey[k],]
      Temp_Seq = c()
      if(is.na(as.numeric(format(as.Date(as.character(Link_Table$LINKENDDT[Link_Table$gvkey %in% Compustat$gvkey[k]][j]),format="%Y%m%d"),"%Y")))){
        Temp_Seq = seq(from = as.numeric(format(as.Date(as.character(Link_Table$LINKDT[Link_Table$gvkey %in% Compustat$gvkey[k]][j]),format="%Y%m%d"),"%Y")),
                       to = 2020)
        Temp_Seq = unique(Temp_Seq)
        Temp_Indik = Temp$fyear %in% Temp_Seq
        Temp = Temp[Temp$fyear %in% Temp_Seq,] 
        #
        Compustat[Compustat$gvkey == Compustat$gvkey[k],]$PERMNO[Temp_Indik] = rep(Link_Table$LPERMNO[Link_Table$gvkey %in% Compustat$gvkey[k]][j],nrow(Temp))
      }else{
        Temp_Seq = seq(from = as.numeric(format(as.Date(as.character(Link_Table$LINKDT[Link_Table$gvkey %in% Compustat$gvkey[k]][j]),format="%Y%m%d"),"%Y")),
                       to = as.numeric(format(as.Date(as.character(Link_Table$LINKENDDT[Link_Table$gvkey %in% Compustat$gvkey[k]][j]),format="%Y%m%d"),"%Y")))
        Temp_Seq = unique(Temp_Seq)
        Temp_Indik = Temp$fyear %in% Temp_Seq
        Temp = Temp[Temp$fyear %in% Temp_Seq,] 
        #
        Compustat[Compustat$gvkey == Compustat$gvkey[k],]$PERMNO[Temp_Indik] = rep(Link_Table$LPERMNO[Link_Table$gvkey %in% Compustat$gvkey[k]][j],nrow(Temp))
      }
    }
  }
}
options(warn=1)
#Remove all without PERMNO and then remove the PERMNO

Compustat = Compustat[!is.na(Compustat$PERMNO),]
#Compustat$PERMNO = NULL

which(table(paste0(Compustat$datadate,Compustat$PERMNO))>1)
Compustat[Compustat$PERMNO==11835,]

Compustat = Compustat[!duplicated(paste0(Compustat$datadate,Compustat$PERMNO)),]


save(Compustat,file="CompustatResult.RData")
load("CompustatResult.RData")
#WoooHHOOOO --> Calculate the Concentration Ratios:

#All NAICS_4 with only three digits are NA:
Compustat$NAICS_4[Compustat$NAICS_4<1000] = NA
Compustat$NAICS_3[Compustat$NAICS_3<100] = NA
#The Herfindahl Index is simply the squared and summed up industry market share of each company in the industry

library(data.table)
setDT(Compustat)
#for three and four digit companies

#Sales per industry per year
Sale_IndustryThree = Compustat[,sum(sale),by=c("NAICS_3","fyear")]
Sale_IndustryFour = Compustat[,sum(sale),by=c("NAICS_4","fyear")]
Compustat$HHI_3 = rep(NA,nrow(Compustat))
Compustat$HHI_4 = rep(NA,nrow(Compustat))
#options(warn=2)
#Market Share of each company per year
for(i in which(!duplicated(Compustat$NAICS_3))){
  Temp = Compustat[Compustat$NAICS_3 == Compustat$NAICS_3[i],]
  #per year
  if(nrow(Temp) == 0){
    
  }else{
  for(j in 1:length(unique(Temp$fyear))){
  if(!any(is.na(Temp$NAICS_3))){
      Temp$HHI_3[Temp$fyear==unique(Temp$fyear)[j]] = (100*Temp$sale[Temp$fyear==unique(Temp$fyear)[j]]/na.omit(Sale_IndustryThree$V1[Sale_IndustryThree$fyear==unique(Temp$fyear)[j] & Sale_IndustryThree$NAICS_3==unique(Temp$NAICS_3)]))^2
      }else{
      Temp$HHI_3[Temp$fyear==unique(Temp$fyear)[j]] = NA
      }
  }
  Compustat$HHI_3[Compustat$NAICS_3 == Compustat$NAICS_3[i] & !is.na(Compustat$NAICS_3)] = Temp$HHI_3
  }
  print(i)
}

for(i in which(!duplicated(Compustat$NAICS_4))){
  Temp = Compustat[Compustat$NAICS_4 == Compustat$NAICS_4[i],]
  if(nrow(Temp) == 0){
    
  }else{
  #per year
  for(j in 1:length(unique(Temp$fyear))){
    if(!any(is.na(Temp$NAICS_4))){
      Temp$HHI_4[Temp$fyear==unique(Temp$fyear)[j]] = (100*Temp$sale[Temp$fyear==unique(Temp$fyear)[j]]/na.omit(Sale_IndustryFour$V1[Sale_IndustryFour$fyear==unique(Temp$fyear)[j] & Sale_IndustryFour$NAICS_4==unique(Temp$NAICS_4)]))^2
    }else{
      Temp$HHI_4[Temp$fyear==unique(Temp$fyear)[j]] = NA
    }
  }
  Compustat$HHI_4[Compustat$NAICS_4 == Compustat$NAICS_4[i]& !is.na(Compustat$NAICS_4)] = Temp$HHI_4
  }
  print(i)
}
library(lubridate)
Compustat$datadate <- Compustat$datadate %m+% months(4)
Compustat$fyear <- as.numeric(format(Compustat$datadate,"%Y"))

#Sum up the market share over all companies within the industry each year
HHI_3 = Compustat[,sum(HHI_3),by=c("NAICS_3","fyear")]
HHI_3$V2 = Compustat[,length(HHI_3),by=c("NAICS_3","fyear")]$V1
HHI_3$Sale = Compustat[,sum(sale),by=c("NAICS_3","fyear")]$V1


HHI_4 = Compustat[,sum(HHI_4),by=c("NAICS_4","fyear")]
HHI_4$V2 = Compustat[,length(HHI_4),by=c("NAICS_4","fyear")]$V1
HHI_4$Sale = Compustat[,sum(sale),by=c("NAICS_4","fyear")]$V1

#For one factor, value-weight the HHI indices of each industry
HHI_3 <- HHI_3[order(HHI_3$fyear),]
HHI_3 <- HHI_3[order(HHI_3$NAICS_3),]
k = 1
plot(HHI_3$fyear[HHI_3$NAICS_3 == unique(HHI_3$NAICS_3)[k]],
     HHI_3$V1[HHI_3$NAICS_3 == unique(HHI_3$NAICS_3)[k]],type="l")

HHI_4 <- HHI_4[order(HHI_4$fyear),]
HHI_4 <- HHI_4[order(HHI_4$NAICS_4),]

plot(HHI_4$fyear[HHI_4$NAICS_4 == unique(HHI_4$NAICS_4)[k]],
     HHI_4$V1[HHI_4$NAICS_4 == unique(HHI_4$NAICS_4)[k]],type="l")

HHI_3 <- HHI_3[order(HHI_3$fyear),]
#Sum of Sale
Industry_Sum <- HHI_3[,sum(Sale,na.rm=T),by=fyear]
HHI_3$Total_Sales = rep(NA,nrow(HHI_3))
for(i in 1:nrow(Industry_Sum)){
HHI_3$Total_Sales[HHI_3$fyear == Industry_Sum$fyear[i]] = rep(Industry_Sum$V1[i],sum(HHI_3$fyear == Industry_Sum$fyear[i]))
}

Herfindahl <- HHI_3[,sum(V1*(Sale/Total_Sales),na.rm=T),by=fyear]

plot(Herfindahl[Herfindahl$fyear>1971,],type="l")

HHI_4 <- HHI_4[order(HHI_4$fyear),]
#Sum of Sale
Industry_Sum <- HHI_4[,sum(Sale,na.rm=T),by=fyear]
HHI_4$Total_Sales = rep(NA,nrow(HHI_4))
for(i in 1:nrow(Industry_Sum)){
  HHI_4$Total_Sales[HHI_4$fyear == Industry_Sum$fyear[i]] = rep(Industry_Sum$V1[i],sum(HHI_4$fyear == Industry_Sum$fyear[i]))
}

Herfindahl <- HHI_4[,sum(V1*(Sale/Total_Sales),na.rm=T),by=fyear]
plot(Herfindahl[Herfindahl$fyear>1985,],type="l",ylab="HHI",main="")

#Mean Firm Size:
#Seems to match okay (...)
plot(Compustat[,length(sale),by=fyear][order(Compustat[,length(sale),by=fyear]$fyear),],type="l")

save(HHI_3,file="HHI_3.RData")
save(HHI_4,file="HHI_4.RData")

#HHI_3 NAICS_3 Code with Sales
